{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ce7ed168",
   "metadata": {},
   "source": [
    "### Generate .bed file with inactive genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "33216e00",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path \n",
    "import pandas as pd\n",
    "import pysam "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8879949a",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path=Path(wkdir)\n",
    "\n",
    "inactive_genes_pass_qc_path=wkdir_path.joinpath(\"2_misexp_qc/misexp_gene_cov_corr/gene_id_post_tech_cov_qc_8650.txt\")\n",
    "gencode_gtf_path=wkdir_path.joinpath(\"reference/gencode/gencode.v31.annotation.sorted.gtf.gz\")\n",
    "# output \n",
    "out_dir = wkdir_path.joinpath(\"3_misexp_genes/bed_files\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5bd3213e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of inactive genes passing QC: 8650\n"
     ]
    }
   ],
   "source": [
    "inactive_genes_pass_qc = set(pd.read_csv(inactive_genes_pass_qc_path, sep=\"\\t\", header=None)[0])\n",
    "print(f\"Number of inactive genes passing QC: {len(inactive_genes_pass_qc)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9b360f32",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes in bed file: 8650\n"
     ]
    }
   ],
   "source": [
    "inactive_genes_bed_path = out_dir.joinpath(\"inactive_genes_phylop.bed\")\n",
    "gene_id_to_bed = []\n",
    "with open(inactive_genes_bed_path, \"w\") as f:\n",
    "    for gtf in pysam.TabixFile(str(gencode_gtf_path)).fetch(parser = pysam.asGTF()):\n",
    "        if gtf.feature == \"gene\" and gtf.gene_id.split(\".\")[0] in inactive_genes_pass_qc:\n",
    "            chrom, start, end, gene_id = gtf.contig, gtf.start, gtf.end, gtf.gene_id\n",
    "            gene_id = gene_id.split(\".\")[0]\n",
    "            gene_id_to_bed.append(gene_id.split(\".\")[0])\n",
    "            f.write(f\"{chrom}\\t{start}\\t{end}\\t{gene_id}\\n\")\n",
    "print(f\"Number of genes in bed file: {len(gene_id_to_bed)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb80febc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
